library(MultiAssayExperiment)
library(knitr)
library(readxl)
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggvenn)
Data Overview:
Patient and Treatment Details:
Additional Information:
In this section, we will load the multi-assay .rds file,
extract relevant clinical, expression, and annotation data, and prepare
the expression data for analysis by normalizing it using quantile
normalization. We will also integrate the radiomics data for further
analysis.
# Load your multiassay result and extract clinical data , expression data and annotation
#load mae obj
mae <- readRDS("~/BHK lab/I-SPY-Project/Source Data/ICB_Wolf.rds")
#extract Clinical data
clin <- data.frame(colData(mae)) # Dim 987 x 86
# Display only the first 10 rows of the 'clin' dataset
DT::datatable(clin, options = list(scrollX = TRUE, pageLength = 10), caption = "Genomics clinical data")
# Extract the expression data(Microarray)
expr <- assays(mae)[["expr"]] # Dim 18348 x 987
#extracting the annotation
annot <- data.frame(rowData(mae@ExperimentList$expr))
# Display few rows of the microarray
DT::datatable(expr[1:8, 1:4])
DT::datatable(annot, options = list(scrollX = TRUE, pageLength = 10), caption = "Annotation of Microarray genes")
We have metadata from
ISPY2-Imaging-Cohort-1-Clinical-Data.xlsx extracted from The Cancer
Imaging Archive - ISPY2 Collection. This clinical data includes 985
cases (719 I-SPY2 cases plus 266 ACRIN-6698
cases).
The radiomics data from the I-SPY2 Trial includes dynamic contrast-enhanced MRI (DCE-MRI) scans performed on patients before and during neoadjuvant chemotherapy. This dataset includes MRI exams of 719 patients with 4 scans per patient, designed to evaluate tumor response to treatment based on functional tumor volume (FTV) and other features.
Note: Each MRI scan could involve several segmentations depending on the analysis, such as for functional tumor volume (FTV), longest diameter, and sphericity measurements. A retrospective study within the trial involved a subset of 384 patients where multiple MRI features were segmented and analyzed
The I-SPY2 Imaging Cohort 1 consists of MRI data for 985 patients, combining data from the I-SPY2 trial and the ACRIN 6698 study. These MRI data were acquired at multiple time points across more than 22 clinical centers.
This analysis will allow us to better understand the overlap between genomics and radiomics data and facilitate integrated analyses.
# read the Radiomics clincal data
Radiomics_clin <- read_excel("~/BHK lab/I-SPY-Project/Source Data/ISPY2-Imaging-Cohort-1-Clinical-Data.xlsx") # dim 985 x 10
DT::datatable(Radiomics_clin, options = list(scrollX = TRUE, pageLength = 10), caption = "Radiomics clinical dataframe")
# Rename clin to genomic
Genomics_clin <- clin # dim 987 x 86
# Remove the "X" prefix from patient IDs
Genomics_clin$patientid <- sub("^X", "", Genomics_clin$patientid)
rownames(Genomics_clin) <- Genomics_clin$patientid
colnames(expr) <- sub("^X", "", colnames(expr))
# Intersection of patient IDs between Genomics_clin and Radiomics_clin
common_patients <- intersect(Genomics_clin$patientid, Radiomics_clin$Patient_ID) #982 common patients
# Print the result in a nice format
cat("The number of patients with both microarray and MRI data is:", length(common_patients), "\n")
## The number of patients with both microarray and MRI data is: 982
# Print the result in a nice format
cat("The number of patients with both microarray and MRI data is:", length(common_patients), "\n")
## The number of patients with both microarray and MRI data is: 982
venn_data <- list(
Genomics = Genomics_clin$patientid,
Radiomics = Radiomics_clin$Patient_ID
)
# Create Venn diagram
ggvenn(venn_data, fill_color = c("#A6CEE3", "#FB9A99"), stroke_size = 0.5, set_name_size = 4) +
ggtitle("Overlap of Genomics (Microarray) and Radiomics (MRI) Patient IDs") +
theme(
plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
plot.background = element_rect(fill = "white"),
panel.background = element_blank()
)
## Merging Genomics and Radiomics Data based on Patient IDs
We merge the Genomics_clin and
Radiomics_clin datasets based on common patient IDs,
ensuring that radiomics columns are prefixed with “Radiomics_” for
tracking purposes. After the merge, the data will be saved as a CSV file
for further analysis.
# Add "Radiomics_" prefix to the column names of Radiomics_clin (except patient ID)
colnames(Radiomics_clin)[-which(colnames(Radiomics_clin) == "Patient_ID")] <-
paste0("Radiomics_", colnames(Radiomics_clin)[-which(colnames(Radiomics_clin) == "Patient_ID")])
# Merge the Genomics and Radiomics data by patient ID
merged_clin <- merge(Genomics_clin, Radiomics_clin, by.x = "patientid", by.y = "Patient_ID")
# Order the merged data by patient ID
merged_clin <- merged_clin[order(merged_clin$patientid), ]
DT::datatable(merged_clin, options = list(scrollX = TRUE, pageLength = 10), caption = "Merged clinical data: including Genomics and Radiomics clin data by patient ID")
# Save the merged dataframe as a CSV file
#write.csv(merged_clin, "~/BHK lab/I-SPY-Project/output Data/Merged_clin_GenomicandRadoiomic.csv", row.names = FALSE)
We aim to ensure that each patient has the expected segmentation images, specifically 4 segmentation masks related to their Dynamic Contrast Enhanced (DCE) MRI scans. This information is crucial for the Functional Tumor Volume (FTV) analysis, part of the I-SPY clinical trials.
The segmentation process includes four or five distinct masking steps applied during the DCE MRI acquisition. These steps include:
The DICOM masks are stored as 2D derived images that align with the original DCE MRI acquisitions. These masks are available for download through the UCSF Box site here, and they represent different time points (T1, T2, T3, and T4) during treatment.
ispy1_dce_series_info.xlsxThe file ispy1_dce_series_info.xlsx (sheet:
DCE Series, All 20160823) lists all original DCE series in
the I-SPY 1 collection and provides detailed metadata for each patient’s
DCE MRI series, including:
By analyzing this data, we can validate that each patient has 4 segmentation masks, corresponding to the four time points.
results: 221 patienst have segmanations ,Each patient from is expected to have at least 4 segmentation masks across the four study time points (T1-T4). After reviewing the series information in the provided DICOM metadata and segmentation mask files, all patients, except for 24, have at least 4 segmentation file.
# read ispy1_dce_series_info.xlsx` (sheet: `DCE Series, All 20160823`)
ispy1_dce_series_info <- read_excel("~/BHK Lab/I-SPY-Project/Source Data/ispy1_dce_series_info.xlsx", sheet = "DCE Series, All 20160823") #dim 1473 x 37
# display DCE Series info
DT::datatable(ispy1_dce_series_info, options = list(scrollX = TRUE, pageLength = 10), caption = "Lists all original DCE series in the I-SPY 1 collection info per patient's DCE MRI series")
# Count the number of series per patient for each time point and calculate the total number of segmentation
series_per_patient <- ispy1_dce_series_info %>%
group_by(PATIENT_ID, TIME_POINT) %>%
summarize(Series_Count = n(), .groups = 'drop') %>%
pivot_wider(names_from = TIME_POINT, values_from = Series_Count, values_fill = 0) %>%
mutate(across(starts_with("T"), as.numeric), # Ensure all time point columns are numeric
Total_Segmentations = rowSums(select(., starts_with("T")), na.rm = TRUE))
# display DCE Series info
DT::datatable(ispy1_dce_series_info , options = list(scrollX = TRUE, pageLength = 10), caption = "Table: Number of Series per Patient at Each Time Point amd total numbers of segmentation")
# Filter for patients with less than 4 segmentation
patients_less_than_4 <- series_per_patient %>%
filter(Total_Segmentations < 4) %>%
select(PATIENT_ID, Total_Segmentations)
DT::datatable(patients_less_than_4, options = list(scrollX = TRUE, pageLength = 10), caption = "Table: Patients with Less Than 4 Values")
# # read ispy1_dce_series_info.xlsx` (sheet: "All Series, All exams 20160720")
# ispy1_all_series_info <- read_excel("~/BHK Lab/I-SPY-Project/Source Data/ispy1_all_series_info.xlsx", sheet = "All Series, All exams 20160720") #dim 7854 x 11
# ispy1_segmentaions_info <- ispy1_all_series_info[ispy1_all_series_info$SERIES_CAT %in% "segmentation",] # dim 1416 X 11
#
# ispy1_segmentaions_per_patient <- ispy1_segmentaions_info %>%
# group_by(PATIENT_ID, TIME_POINT) %>%
# summarize(Series_Count = n(), .groups = 'drop') %>%
# pivot_wider(names_from = TIME_POINT, values_from = Series_Count, values_fill = 0) %>%
# mutate(across(starts_with("T"), as.numeric), # Ensure all time point columns are numeric
# Total_Segmentations = rowSums(select(., starts_with("T")), na.rm = TRUE))